In [31]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource
from bokeh.io import output_notebook
from bokeh.layouts import gridplot
In [10]:
csv_file_path = 'ObesityDataSet_raw_and_data_sinthetic.csv'
csv_df = pd.read_csv(csv_file_path, delimiter=';')
df_csv = pd.DataFrame(csv_df)
df_csv.head()
Out[10]:
| Nombre | Apellido | Genero | Edad | Estatura | Peso | HistorialFamiliar | ConsumeCalorias | ComidasDiarias | AlimentosEntreComidas | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Wendy | Greene | Femenino | 21 | 1.62 | 64 | yes | no | 3 | Sometimes |
| 1 | Martha | Jones | Femenino | 21 | 1.52 | 56 | yes | no | 3 | Sometimes |
| 2 | David | Curtis | Masculino | 23 | 1.8 | 77 | yes | no | 3 | Sometimes |
| 3 | Crystal | Solis | Femenino | 27 | 1.8 | 87 | no | no | 3 | Sometimes |
| 4 | Stephanie | Vasquez | Femenino | 22 | 1.78 | 89.8 | no | no | 1 | Sometimes |
In [11]:
#Ingreso a la base de datos de Mysql
usuario = 'root'
contraseña = 'jona'
host = 'localhost'
puerto = '3306'
base_de_datos = 'obesidad' # Nombre de la base de datos
In [12]:
# Crear la cadena de conexión
conexion = f'mysql+mysqlconnector://{usuario}:{contraseña}@{host}:{puerto}/{base_de_datos}'
In [13]:
# Crear el motor de conexión
engine = create_engine(conexion)
In [14]:
# Leer datos de la tabla MySQL
tabla_mysql = 'obesidadsql' # Nombre de la tabla
mysql_df = pd.read_sql(tabla_mysql, con=engine)
df_sql = pd.DataFrame(mysql_df)
df_sql.head()
Out[14]:
| Nombre | Apellido | Fuma | Agua | ControlCalorias | FrecuenciaEjercicio | TiempoRecreacion | ConsumoAlcohol | MedioTransporte | Obesidad | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Wendy | Greene | no | 2 | no | 0 | 1 | no | Public_Transportation | Normal_Weight |
| 1 | Martha | Jones | yes | 3 | yes | 3 | 0 | Sometimes | Public_Transportation | Normal_Weight |
| 2 | David | Curtis | no | 2 | no | 2 | 1 | Frequently | Public_Transportation | Normal_Weight |
| 3 | Crystal | Solis | no | 2 | no | 2 | 0 | Frequently | Walking | Overweight_Level_I |
| 4 | Stephanie | Vasquez | no | 2 | no | 0 | 0 | Sometimes | Public_Transportation | Overweight_Level_II |
In [15]:
#Unir los DataFrames por las columnas 'nombre' y 'apellido'
df_unido = pd.merge(csv_df, mysql_df, on=['Nombre', 'Apellido'], how='inner')
df_unido.head()
Out[15]:
| Nombre | Apellido | Genero | Edad | Estatura | Peso | HistorialFamiliar | ConsumeCalorias | ComidasDiarias | AlimentosEntreComidas | Fuma | Agua | ControlCalorias | FrecuenciaEjercicio | TiempoRecreacion | ConsumoAlcohol | MedioTransporte | Obesidad | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Wendy | Greene | Femenino | 21 | 1.62 | 64 | yes | no | 3 | Sometimes | no | 2 | no | 0 | 1 | no | Public_Transportation | Normal_Weight |
| 1 | Martha | Jones | Femenino | 21 | 1.52 | 56 | yes | no | 3 | Sometimes | yes | 3 | yes | 3 | 0 | Sometimes | Public_Transportation | Normal_Weight |
| 2 | David | Curtis | Masculino | 23 | 1.8 | 77 | yes | no | 3 | Sometimes | no | 2 | no | 2 | 1 | Frequently | Public_Transportation | Normal_Weight |
| 3 | Crystal | Solis | Femenino | 27 | 1.8 | 87 | no | no | 3 | Sometimes | no | 2 | no | 2 | 0 | Frequently | Walking | Overweight_Level_I |
| 4 | Stephanie | Vasquez | Femenino | 22 | 1.78 | 89.8 | no | no | 1 | Sometimes | no | 2 | no | 0 | 0 | Sometimes | Public_Transportation | Overweight_Level_II |
In [16]:
# Guardar el DataFrame unido en un archivo CSV
output_csv_path = 'salida.csv' #
df_unido.to_csv(output_csv_path, index=False)
In [21]:
# Define functions to calculate new columns
def calculate_bmi(weight, height):
return round(weight / (height ** 2), 2)
def categorize_bmi(bmi):
if bmi < 18.5:
return 'Underweight'
elif 18.5 <= bmi < 24.9:
return 'Normal weight'
elif 25 <= bmi < 29.9:
return 'Overweight'
elif 30 <= bmi < 34.9:
return 'Obesity Class I'
elif 35 <= bmi < 39.9:
return 'Obesity Class II'
else:
return 'Obesity Class III'
def evaluate_physical_activity(exercise_frequency):
if exercise_frequency == 0:
return 'Sedentary'
elif exercise_frequency <= 2:
return 'Moderately active'
else:
return 'Active'
def assess_health_risk(smokes, alcohol_consumption, family_history):
risk = 0
if smokes == 'yes':
risk += 1
if alcohol_consumption in ['Sometimes', 'Frequently']:
risk += 1
if family_history == 'yes':
risk += 1
if risk == 0:
return 'Low'
elif risk == 1:
return 'Moderate'
else:
return 'High'
def hydration_level(water_intake):
if water_intake >= 8:
return 'Well hydrated'
elif 4 <= water_intake < 8:
return 'Adequately hydrated'
else:
return 'Poorly hydrated'
# Add new columns to the DataFrame
df_unido['IMC'] = df_unido.apply(lambda row: calculate_bmi(row['Peso'], row['Estatura']), axis=1)
df_unido['CategoriaIMC'] = df_unido['IMC'].apply(categorize_bmi)
df_unido['ActividadFísica'] = df_unido['FrecuenciaEjercicio'].apply(evaluate_physical_activity)
df_unido['RiesgoSalud'] = df_unido.apply(lambda row: assess_health_risk(row['Fuma'], row['ConsumoAlcohol'], row['HistorialFamiliar']), axis=1)
df_unido['Hidratación'] = df_unido['Agua'].apply(hydration_level)
# Display the first few rows of the updated dataframe to confirm the new columns
df_unido.head()
Out[21]:
| Nombre | Apellido | Genero | Edad | Estatura | Peso | HistorialFamiliar | ConsumeCalorias | ComidasDiarias | AlimentosEntreComidas | ... | FrecuenciaEjercicio | TiempoRecreacion | ConsumoAlcohol | MedioTransporte | Obesidad | IMC | CategoriaIMC | ActividadFísica | RiesgoSalud | Hidratación | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Wendy | Greene | Femenino | 21 | 1.62 | 64.0 | yes | no | 3 | Sometimes | ... | 0 | 1 | no | Public_Transportation | Normal_Weight | 24.39 | Normal weight | Sedentary | Moderate | Poorly hydrated |
| 1 | Martha | Jones | Femenino | 21 | 1.52 | 56.0 | yes | no | 3 | Sometimes | ... | 3 | 0 | Sometimes | Public_Transportation | Normal_Weight | 24.24 | Normal weight | Active | High | Poorly hydrated |
| 2 | David | Curtis | Masculino | 23 | 1.80 | 77.0 | yes | no | 3 | Sometimes | ... | 2 | 1 | Frequently | Public_Transportation | Normal_Weight | 23.77 | Normal weight | Moderately active | High | Poorly hydrated |
| 3 | Crystal | Solis | Femenino | 27 | 1.80 | 87.0 | no | no | 3 | Sometimes | ... | 2 | 0 | Frequently | Walking | Overweight_Level_I | 26.85 | Overweight | Moderately active | Moderate | Poorly hydrated |
| 4 | Stephanie | Vasquez | Femenino | 22 | 1.78 | 89.8 | no | no | 1 | Sometimes | ... | 0 | 0 | Sometimes | Public_Transportation | Overweight_Level_II | 28.34 | Overweight | Sedentary | Moderate | Poorly hydrated |
5 rows × 23 columns
In [24]:
#Visualizaciones con Matplotlib
import matplotlib.pyplot as plt
tabla_contingencia = pd.crosstab(df_unido['Genero'], df_unido['Obesidad'])
tabla_contingencia.plot(kind='bar', figsize=(10, 6))
plt.title('Distribución de Obesidad por Genero')
plt.xlabel('Genero')
plt.ylabel('Peso')
plt.xticks(rotation=0)
plt.legend(title='Obesidad')
plt.grid(True)
plt.show()
In [25]:
# Verificar y convertir los datos a tipo numérico si es necesario
df_unido['Peso'] = pd.to_numeric(df_unido['Peso'], errors='coerce')
df_unido['Estatura'] = pd.to_numeric(df_unido['Estatura'], errors='coerce')
In [26]:
# Definir los grupos de estatura
bins = [1.50, 1.60, 1.70, 1.80, 1.90, 2.00]
labels = ['1.50-1.59', '1.60-1.69', '1.70-1.79', '1.80-1.89', '1.90-1.99']
df_unido['GrupoEstatura'] = pd.cut(df_unido['Estatura'], bins=bins, labels=labels, right=False)
# Calcular el peso promedio por grupo de estatura
peso_promedio_por_grupo = df_unido.groupby('GrupoEstatura')['Peso'].mean().reset_index()
import matplotlib.pyplot as plt
# Configurar el tamaño del gráfico
plt.figure(figsize=(10, 6))
# Crear el gráfico de barras
plt.bar(peso_promedio_por_grupo['GrupoEstatura'], peso_promedio_por_grupo['Peso'], color='skyblue')
# Configuración del gráfico
plt.title('Peso Promedio por Grupo de Estatura')
plt.xlabel('Grupo de Estatura (m)')
plt.ylabel('Peso Promedio (kg)')
plt.grid(axis='y', linestyle='--', alpha=0.7)
# Mostrar el gráfico
plt.show()
C:\Users\Josshy\AppData\Local\Temp\ipykernel_12900\631464415.py:6: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
peso_promedio_por_grupo = df_unido.groupby('GrupoEstatura')['Peso'].mean().reset_index()
In [27]:
# Asegurarse de que las columnas que se van a utilizar sean numéricas
df_unido['Edad'] = pd.to_numeric(df_unido['Edad'], errors='coerce')
df_unido['Estatura'] = pd.to_numeric(df_unido['Estatura'], errors='coerce')
df_unido['Peso'] = pd.to_numeric(df_unido['Peso'], errors='coerce')
# Eliminar filas con valores NaN en las columnas numéricas
df_unido = df_unido.dropna(subset=['Edad', 'Estatura', 'Peso'])
In [32]:
# Visualizaciones con Bokeh
output_notebook()
# Histograma de edades
hist, edges = np.histogram(df_unido['Edad'], bins=20)
p1 = figure(title="Distribución de Edades", x_axis_label='Edad', y_axis_label='Peso')
p1.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], fill_color="blue", line_color="black")
# Gráfico de dispersión de Peso vs Estatura
p2 = figure(title="Relación entre Estatura y Peso", x_axis_label='Estatura (cm)', y_axis_label='Peso (kg)')
p2.circle(df_unido['Estatura'], df_unido['Peso'], size=10, color="navy", alpha=0.5)
grid = gridplot([[p1, p2]])
show(grid)
In [33]:
import pygwalker as pyg
# Crear un objeto Pygwalker
pyg.walk(df_unido)
Box(children=(HTML(value='<div id="ifr-pyg-00061979fc0b5d64CSqMOJ7U5Bp9oQDF" style="height: auto">\n <head>…
Loading Graphic-Walker UI...
Out[33]:
<pygwalker.api.pygwalker.PygWalker at 0x1a4843688e0>
In [ ]: